rm(list=setdiff(ls(), "twd")) 
library(tidycensus)
library(tigris)
library(terra)
library(dplyr)
library(wru)
library(stringr)
library(fixest)
library(AER)
library(sf)
options(tigris_use_cache = TRUE)

set.seed(24)

## Data from make_data.R ##
data<-read.csv(paste(twd,"data/made_data/vreg_wgeos_rep.csv",sep=""))

### Get Spatial features
coord <- data.frame(lat = data$lat_hom, lon=data$lon_hom,data$text_registrant_id)
points_vect <- vect(coord, geom = c("lon", "lat"), crs = "EPSG:4326")
## Get census tracs
### Spatial join to get each observation matched to the right census tract 
tracts<-vect(paste(twd,"data/GIS/tl_2020_47_tract/tl_2020_47_tract.shp",sep=""))
tracts <- project(tracts, crs(points_vect))
points_with_tract <-intersect(points_vect, tracts)
## This used to work with tidycensus but it stopped (thanks DOGE), so use the above w/ the GIS map of tracts
#tracts <- tracts(state = "TN", cb = TRUE, year = 2020)
#tracts <- st_transform(tracts, crs = st_crs(points_sf))
#points_with_tract <- st_join(points_sf, tracts, join = st_intersects)
points_with_tract<-data.frame(points_with_tract$data.text_registrant_id,points_with_tract$TRACTCE)
names(points_with_tract)<-c("text_registrant_id","tract_n")
data<-merge(data,points_with_tract,by="text_registrant_id",all=T)

############## Use the Khanna Imai method to impute race##################

race_lookup <- data.frame( text_registrant_id = data$text_registrant_id,
                           last_name = data$text_name_last,
                           first_name = data$text_name_first,
                           middle_name = data$text_name_middle,
                           tract =  data$tract_n,
                           sex = ifelse(data$cde_gender=="F",1,0),
                           age = round(data$age),
                           state = rep("TN",nrow(data)),
                           county = rep("037",nrow(data))
)


# Format data for wru
# `wru` expects certain column names: "first", "last", "zip"
race_prepped <-race_lookup %>%
  rename(text_registrant_id = text_registrant_id, surname = last_name,first = first_name, middle = middle_name, tract = tract, sex = sex, age = age,state=state,county=county)
# Run racial estimation
# Use the `predict_race` function from the `wru` package
CensusObj3 <- try(get_census_data(state = "TN", census.geo = "tract",year=2020, key = "d5ce735cbc077961fad12598c5a76aed654c9f97"))


race_result <- predict_race(
  voter.file = race_prepped,
  census.data = CensusObj3,
  sex = T,
  age = T,
  census.geo="tract",
  impute.missing = TRUE,
  surname.only = FALSE,
  use.counties = FALSE,
  model= "BISG",
  skip_bad_geos = TRUE,
  names.to.use = c("surname")
)


race_res<-race_result[,c("text_registrant_id","pred.whi","pred.bla","pred.his","pred.asi","pred.oth")]
## Merge into main data frame
data<-merge(race_res,data,by="text_registrant_id",all.y=T)
#data$cde_race
### make race dummies
data$race<-ifelse(data$pred.whi > .5, "White",NA)
data$race<-ifelse(data$pred.bla > .5, "Black",data$race)
data$race<-ifelse(data$pred.his > .5, "Hispanic",data$race)
data$race<-ifelse(data$pred.asi > .5, "Asian",data$race)
data$race<-ifelse(data$pred.oth > .5, "Other",data$race)
### do the same but match the description in the data 
data$race2<-ifelse(data$pred.whi > .5, "W   ",NA)
data$race2<-ifelse(data$pred.bla > .5, "B   ",data$race2)
data$race2<-ifelse(data$pred.his > .5, "H   ",data$race2)
data$race2<-ifelse(data$pred.asi > .5, "A   ",data$race2)
data$race2<-ifelse(data$pred.oth > .5, "O   ",data$race2)

## Classify the unclassified as NAs
## take respondents who gave their own race and keep that, use modeled for those for whom we only have the modeled (kahna imai) data
data$race_mod<-ifelse(data$cde_race =="", NA, data$cde_race)
data$race_mod<-ifelse(is.na(data$race_mod), data$race2, data$race_mod)
data$race_mod2<-NA
data$race_mod2<-ifelse(data$race_mod =="A   ", "Asian", data$race_mod2)
data$race_mod2<-ifelse(data$race_mod =="B   ", "Black", data$race_mod2)
data$race_mod2<-ifelse(data$race_mod =="W   ", "White", data$race_mod2)
data$race_mod2<-ifelse(data$race_mod =="H   ", "Hispanic", data$race_mod2)
data$race_mod2<-ifelse(data$race_mod =="O   "|data$race_mod=="I   ", "Other",data$race_mod2)



##### Use census data to model income 

census_api_key("d5ce735cbc077961fad12598c5a76aed654c9f97", install = TRUE,overwrite=T)
readRenviron("~/.Renviron")

income_vars <- c(
  white_median_income = "B19013A_001", # Median household income: White alone
  black_median_income = "B19013B_001", # Median household income: Black or African American alone
  asian_median_income = "B19013D_001", # Median household income: Asian alone
  hispanic_median_income = "B19013I_001" # Median household income: Hispanic or Latino
)

age_income_vars <- c(
  "B19049_002" = "Under_25",      # Householder under 25 years
  "B19049_003" = "Age_25_44",     # Householder 25 to 44 years
  "B19049_004" = "Age_45_64",     # Householder 45 to 64 years
  "B19049_005" = "Age_65_over"    # Householder 65 years and over
)


### Get income by race for each tract in the data

davidson_race_income <- get_acs(
  geography = "tract",
  variables = income_vars,
  state = "TN",
  county= "Davidson",
  year = 2020,
  survey = "acs5"
)

franklin_race_income <- get_acs(
  geography = "tract",
  variables = income_vars,
  state = "TN",
  county= "Franklin",
  year = 2020,
  survey = "acs5"
)

williamson_race_income <- get_acs(
  geography = "tract",
  variables = income_vars,
  state = "TN",
  county= "Williamson",
  year = 2020,
  survey = "acs5"
)

hamilton_race_income <- get_acs(
  geography = "tract",
  variables = income_vars,
  state = "TN",
  county= "Hamilton",
  year = 2020,
  survey = "acs5"
)

wilson_race_income <- get_acs(
  geography = "tract",
  variables = income_vars,
  state = "TN",
  county= "Wilson",
  year = 2020,
  survey = "acs5"
)

madison_race_income <- get_acs(
  geography = "tract",
  variables = income_vars,
  state = "TN",
  county= "Madison",
  year = 2020,
  survey = "acs5"
)


cheatham_race_income <- get_acs(
  geography = "tract",
  variables = income_vars,
  state = "TN",
  county= "Cheatham",
  year = 2020,
  survey = "acs5"
)


loudon_race_income <- get_acs(
  geography = "tract",
  variables = income_vars,
  state = "TN",
  county= "Loudon",
  year = 2020,
  survey = "acs5"
)


sumner_race_income <- get_acs(
  geography = "tract",
  variables = income_vars,
  state = "TN",
  county= "Sumner",
  year = 2020,
  survey = "acs5"
)


dekalb_race_income <- get_acs(
  geography = "tract",
  variables = income_vars,
  state = "TN",
  county= "DeKalb",
  year = 2020,
  survey = "acs5"
)



robertson_race_income <- get_acs(
  geography = "tract",
  variables = income_vars,
  state = "TN",
  county= "Robertson",
  year = 2020,
  survey = "acs5"
)



### combine all race income tract data

davidson_race_income<-rbind(davidson_race_income,franklin_race_income,williamson_race_income,hamilton_race_income, dekalb_race_income, 
                            wilson_race_income,madison_race_income,cheatham_race_income,loudon_race_income,sumner_race_income, robertson_race_income)

### the tracts unavble to match
davidson_race_income$tract_n<-gsub("47037","",davidson_race_income$GEOID)
davidson_race_income$tract_n<-gsub("47051","",davidson_race_income$tract_n)
davidson_race_income$tract_n<-gsub("47187","",davidson_race_income$tract_n)
davidson_race_income$tract_n<-gsub("47065","",davidson_race_income$tract_n)
davidson_race_income$tract_n<-gsub("47189","",davidson_race_income$tract_n)
davidson_race_income$tract_n<-gsub("47113","",davidson_race_income$tract_n)
davidson_race_income$tract_n<-gsub("47021","",davidson_race_income$tract_n)
davidson_race_income$tract_n<-gsub("47105","",davidson_race_income$tract_n)
davidson_race_income$tract_n<-gsub("47165","",davidson_race_income$tract_n)
davidson_race_income$tract_n<-gsub("47041","",davidson_race_income$tract_n)
davidson_race_income$tract_n<-gsub("47147","",davidson_race_income$tract_n)



## get county average

davidson_race_income2 <- get_acs(
  geography = "county",
  variables = income_vars,
  state = "TN",
  county = "Davidson",
  year = 2020,
  survey = "acs5"
)

## ratios to white income
bla_ratio_county<-davidson_race_income2[2,"estimate"]/davidson_race_income2[1,"estimate"]
asi_ratio_county<-davidson_race_income2[3,"estimate"]/davidson_race_income2[1,"estimate"]
his_ratio_county<-davidson_race_income2[4,"estimate"]/davidson_race_income2[1,"estimate"]



### get income by age  for all tracts

davidson_age_income <- get_acs(
  geography = "tract",
  variables = names(age_income_vars),
  state = "TN",
  county= "Davidson",
  year = 2020,
  survey = "acs5"
)

franklin_age_income <- get_acs(
  geography = "tract",
  variables = names(age_income_vars),
  state = "TN",
  county= "Franklin",
  year = 2020,
  survey = "acs5"
)

williamson_age_income <- get_acs(
  geography = "tract",
  variables = names(age_income_vars),
  state = "TN",
  county= "Williamson",
  year = 2020,
  survey = "acs5"
)

hamilton_age_income <- get_acs(
  geography = "tract",
  variables = names(age_income_vars),
  state = "TN",
  county= "Hamilton",
  year = 2020,
  survey = "acs5"
)

wilson_age_income <- get_acs(
  geography = "tract",
  variables = names(age_income_vars),
  state = "TN",
  county= "Wilson",
  year = 2020,
  survey = "acs5"
)

madison_age_income <- get_acs(
  geography = "tract",
  variables = names(age_income_vars),
  state = "TN",
  county= "Madison",
  year = 2020,
  survey = "acs5"
)


cheatham_age_income <- get_acs(
  geography = "tract",
  variables = names(age_income_vars),
  state = "TN",
  county= "Cheatham",
  year = 2020,
  survey = "acs5"
)


loudon_age_income <- get_acs(
  geography = "tract",
  variables = names(age_income_vars),
  state = "TN",
  county= "Loudon",
  year = 2020,
  survey = "acs5"
)


sumner_age_income <- get_acs(
  geography = "tract",
  variables = names(age_income_vars),
  state = "TN",
  county= "Sumner",
  year = 2020,
  survey = "acs5"
)


dekalb_age_income <- get_acs(
  geography = "tract",
  variables = names(age_income_vars),
  state = "TN",
  county= "DeKalb",
  year = 2020,
  survey = "acs5"
)



robertson_age_income <- get_acs(
  geography = "tract",
  variables = names(age_income_vars),
  state = "TN",
  county= "Robertson",
  year = 2020,
  survey = "acs5"
)



### all age income by tract

davidson_age_income<-rbind(davidson_age_income,franklin_age_income,williamson_age_income,hamilton_age_income, dekalb_age_income, 
                           wilson_age_income,madison_age_income,cheatham_age_income,loudon_age_income,sumner_age_income, robertson_age_income)

### unmerged tracts

davidson_age_income$tract_n<-gsub("47037","",davidson_age_income$GEOID)
davidson_age_income$tract_n<-gsub("47051","",davidson_age_income$tract_n)
davidson_age_income$tract_n<-gsub("47187","",davidson_age_income$tract_n)
davidson_age_income$tract_n<-gsub("47065","",davidson_age_income$tract_n)
davidson_age_income$tract_n<-gsub("47189","",davidson_age_income$tract_n)
davidson_age_income$tract_n<-gsub("47113","",davidson_age_income$tract_n)
davidson_age_income$tract_n<-gsub("47021","",davidson_age_income$tract_n)
davidson_age_income$tract_n<-gsub("47105","",davidson_age_income$tract_n)
davidson_age_income$tract_n<-gsub("47165","",davidson_age_income$tract_n)
davidson_age_income$tract_n<-gsub("47041","",davidson_age_income$tract_n)
davidson_age_income$tract_n<-gsub("47147","",davidson_age_income$tract_n)



### Davidson county income

davidson_age_income2 <- get_acs(
  geography = "county",
  variables = names(age_income_vars),
  state = "TN",
  county= "Davidson",
  year = 2020,
  survey = "acs5"
)


## unique tracts

tracts_dvds<-unique(data$tract_n)

## create race ratios

bla_ratio<-asi_ratio<-his_ratio<-rep(NA,length(tracts_dvds))

for(i in 1:length(tracts_dvds)){
  sub<-davidson_race_income[davidson_race_income$tract_n == tracts_dvds[i],]
  
  if(!is.na(sub[2,"estimate"])){
    bla_ratio[i]<-sub[2,"estimate"]/sub[1,"estimate"]
  } 
  
  if(is.na(sub[2,"estimate"])){ 
    bla_ratio[i]<-bla_ratio_county
  }
  
  if(!is.na(sub[3,"estimate"])){
    asi_ratio[i]<-sub[3,"estimate"]/sub[1,"estimate"]
  } 
  
  if(is.na(sub[2,"estimate"])){ 
    asi_ratio[i]<-asi_ratio_county
  }
  
  
  if(!is.na(sub[4,"estimate"])){
    his_ratio[i]<-sub[4,"estimate"]/sub[1,"estimate"]
  } 
  
  if(is.na(sub[2,"estimate"])){ 
    his_ratio[i]<-his_ratio_county
  }
  
}


### create age categories as in census data

data$age_group<-ifelse(data$age < 25, "age_lt_25" ,NA)
data$age_group<-ifelse(data$age >= 25 & data$age < 45 , "age_25_44" ,data$age_group)
data$age_group<-ifelse(data$age >= 45 & data$age < 65 , "age_45_65" ,data$age_group)
data$age_group<-ifelse(data$age >= 65, "age_gt_65" ,data$age_group)

### county age ratios

county_lt_25_adjust<-davidson_age_income2$estimate[1]/davidson_age_income2$estimate[2]
county_gt_65_adjust<-davidson_age_income2$estimate[4]/davidson_age_income2$estimate[3]
county_25_44_adjust<-davidson_age_income2$estimate[2]/davidson_age_income2$estimate[3]
county_45_65_adjust<-davidson_age_income2$estimate[3]/davidson_age_income2$estimate[2]




tracts_dvds<-unique(data$tract_n)
tracts_dvds<-tracts_dvds[!is.na(tracts_dvds)]
data$inc_estimate<-NA

### loop over tracts, create individual estimate based on age and then race ratios

for(i in 1:length(tracts_dvds)){
  sub_tract<-davidson_age_income[davidson_age_income$tract_n == tracts_dvds[i],]
  sub_tract<-sub_tract[1:4,]
  ## adjust by age
  for(q in 1:4){
    if(sum(is.na(sub_tract$estimate))==1){
      if(q==1 & is.na(sub_tract$estimate[q])){ sub_tract[q, "estimate"]<-sub_tract[2,"estimate"]*county_lt_25_adjust    }
      if(q==2 & is.na(sub_tract$estimate[q])){ sub_tract[q, "estimate"]<-sub_tract[3,"estimate"]*county_25_44_adjust    }
      if(q==3 & is.na(sub_tract$estimate[q])){ sub_tract[q, "estimate"]<-sub_tract[2,"estimate"]*county_45_65_adjust    }
      if(q==4 & is.na(sub_tract$estimate[q])){ sub_tract[q, "estimate"]<-sub_tract[3,"estimate"]*county_gt_65_adjust    } 
    }  
  }
  
  ## adjust by race 
  for(j in 1:nrow(data)){
    if(is.na(data$tract_n[j])){next}
    if(is.na(data$race_mod2[j])){next}
    if(data$tract_n[j] == tracts_dvds[i]){
      if(data$age_group[j] == "age_lt_25"){
        if(data$race_mod2[j]=="White"){
          data$inc_estimate[j]<-sub_tract[1,"estimate"]
        }
        if(data$race_mod2[j]=="Black"){
          data$inc_estimate[j]<-sub_tract[1,"estimate"]*bla_ratio[i]
        }
        if(data$race_mod2[j]=="Asian"){
          data$inc_estimate[j]<-sub_tract[1,"estimate"]*asi_ratio[i]
        }
        if(data$race_mod2[j]=="Hispanic"| data$race_mod2[j]=="Other"){
          data$inc_estimate[j]<-sub_tract[1,"estimate"]*his_ratio[i]
        }
      }
      
      
      if(data$age_group[j] == "age_25_44"){
        if(data$race_mod2[j]=="White"){
          data$inc_estimate[j]<-sub_tract[2,"estimate"]
        }
        if(data$race_mod2[j]=="Black"){
          data$inc_estimate[j]<-sub_tract[2,"estimate"]*bla_ratio[i]
        }
        if(data$race_mod2[j]=="Asian"){
          data$inc_estimate[j]<-sub_tract[2,"estimate"]*asi_ratio[i]
        }
        if(data$race_mod2[j]=="Hispanic"| data$race_mod2[j]=="Other"){
          data$inc_estimate[j]<-sub_tract[2,"estimate"]*his_ratio[i]
        }
        
      }
      
      
      if(data$age_group[j] == "age_45_65"){
        if(data$race_mod2[j]=="White"){
          data$inc_estimate[j]<-sub_tract[3,"estimate"]
        }
        if(data$race_mod2[j]=="Black"){
          data$inc_estimate[j]<-sub_tract[3,"estimate"]*bla_ratio[i]
        }
        if(data$race_mod2[j]=="Asian"){
          data$inc_estimate[j]<-sub_tract[3,"estimate"]*asi_ratio[i]
        }
        if(data$race_mod2[j]=="Hispanic"| data$race_mod2[j]=="Other"){
          data$inc_estimate[j]<-sub_tract[3,"estimate"]*his_ratio[i]
        }
        
        
      }
      
      
      if(data$age_group[j] == "age_gt_65"){
        if(data$race_mod2[j]=="White"){
          data$inc_estimate[j]<-sub_tract[4,"estimate"]
        }
        if(data$race_mod2[j]=="Black"){
          data$inc_estimate[j]<-sub_tract[4,"estimate"]*bla_ratio[i]
        }
        if(data$race_mod2[j]=="Asian"){
          data$inc_estimate[j]<-sub_tract[4,"estimate"]*asi_ratio[i]
        }
        if(data$race_mod2[j]=="Hispanic"| data$race_mod2[j]=="Other"){
          data$inc_estimate[j]<-sub_tract[4,"estimate"]*his_ratio[i]
        }
      }
      
      
      
    }
  }
  #print(i)
}

### create income estimate
data$inc_est<-as.numeric(as.vector(do.call(rbind,data$inc_estimate)))


## save data

saveRDS(data,paste(twd,"data/made_data/vreg_wgeos_race.rds",sep=""))